Set up environment and load data into R
setwd("~/Documents/R/NYPD_MVC/")
mvc.raw <- read.csv(file = "NYPD_Motor_Vehicle_Collisions.csv", header = T)
Get info of the data
- Use str() function to check data structure
str(mvc.raw)
## 'data.frame': 333117 obs. of 29 variables:
## $ DATE : Factor w/ 559 levels "01/01/2015","01/01/2016",..: 351 80 80 80 80 80 80 80 80 80 ...
## $ TIME : Factor w/ 1440 levels "0:00","0:01",..: 506 807 811 816 821 836 837 881 886 912 ...
## $ BOROUGH : Factor w/ 6 levels "","BRONX","BROOKLYN",..: 1 3 4 1 1 5 1 1 1 2 ...
## $ ZIP.CODE : int NA 11210 10029 NA NA 11419 NA NA NA 10466 ...
## $ LATITUDE : num 40.7 40.6 40.8 40.6 NA ...
## $ LONGITUDE : num -74 -74 -73.9 -74.1 NA ...
## $ LOCATION : Factor w/ 51604 levels "","(40.4991346, -74.2434848)",..: 23187 9531 41437 5466 1 20818 1 1 1 50914 ...
## $ ON.STREET.NAME : Factor w/ 7275 levels "","?EST 125 STREET",..: 1 3349 2669 1 6424 149 3893 1032 1 2840 ...
## $ CROSS.STREET.NAME : Factor w/ 7843 levels "","01247","043 PCT",..: 1 3088 4720 1 6776 1026 46 2724 1 3348 ...
## $ OFF.STREET.NAME : Factor w/ 21951 levels "","(26 BROOKLYN TERMINAL MARKET LOT)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ NUMBER.OF.PERSONS.INJURED : int 1 0 0 0 0 1 0 0 2 0 ...
## $ NUMBER.OF.PERSONS.KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBER.OF.PEDESTRIANS.INJURED: int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBER.OF.PEDESTRIANS.KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBER.OF.CYCLIST.INJURED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBER.OF.CYCLIST.KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBER.OF.MOTORIST.INJURED : int 1 0 0 0 0 1 0 0 2 0 ...
## $ NUMBER.OF.MOTORIST.KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CONTRIBUTING.FACTOR.VEHICLE.1: Factor w/ 48 levels "","Accelerator Defective",..: 46 46 22 43 46 34 35 46 15 6 ...
## $ CONTRIBUTING.FACTOR.VEHICLE.2: Factor w/ 48 levels "","Accelerator Defective",..: 46 46 20 46 46 46 35 46 15 46 ...
## $ CONTRIBUTING.FACTOR.VEHICLE.3: Factor w/ 41 levels "","Accelerator Defective",..: 40 1 1 1 1 1 1 1 40 1 ...
## $ CONTRIBUTING.FACTOR.VEHICLE.4: Factor w/ 37 levels "","Accelerator Defective",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CONTRIBUTING.FACTOR.VEHICLE.5: Factor w/ 24 levels "","Aggressive Driving/Road Rage",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ UNIQUE.KEY : int 3467644 3386830 3386649 3387085 3386424 3386935 3386475 3386473 3386684 3386719 ...
## $ VEHICLE.TYPE.CODE.1 : Factor w/ 18 levels "","AMBULANCE",..: 10 10 10 14 15 10 10 15 10 15 ...
## $ VEHICLE.TYPE.CODE.2 : Factor w/ 18 levels "","AMBULANCE",..: 10 15 16 10 1 10 15 15 14 15 ...
## $ VEHICLE.TYPE.CODE.3 : Factor w/ 18 levels "","AMBULANCE",..: 10 1 1 1 1 1 1 1 10 1 ...
## $ VEHICLE.TYPE.CODE.4 : Factor w/ 17 levels "","AMBULANCE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ VEHICLE.TYPE.CODE.5 : Factor w/ 14 levels "","BICYCLE","BUS",..: 1 1 1 1 1 1 1 1 1 1 ...
- The NYPD_Motor_Vehicle_Collisions.csv has lots of empty entries and NAs in cells.
- Some columns are charactor but were assigned to factor.
- There are 333117 observations in the data.
Clean up data
- Subset data by latitude and longitude
mvc.xy <- mvc.raw[!is.na(mvc.raw$LATITUDE) & !is.na(mvc.raw$LONGITUDE), c(1, 2, 5, 6, 24)]
- Use latitude-longitude and map (polygon) infomation to assign each event in which borough or community
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.1-10, (SVN revision 622)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.4, released 2016/01/25
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.3/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.3/Resources/library/rgdal/proj
## Linking to sp version: 1.2-3
library(sp)
borough_wai <- readOGR("Borough_WAI", layer = "geo_export_e9b6dff4-dd56-4670-b398-e4efe9474a76")
## OGR data source with driver: ESRI Shapefile
## Source: "Borough_WAI", layer: "geo_export_e9b6dff4-dd56-4670-b398-e4efe9474a76"
## with 5 features
## It has 4 fields
community_wai <- readOGR("Community_WAI", layer = "geo_export_892906eb-17d4-4acd-86b4-33eb7f6d0a12")
## OGR data source with driver: ESRI Shapefile
## Source: "Community_WAI", layer: "geo_export_892906eb-17d4-4acd-86b4-33eb7f6d0a12"
## with 71 features
## It has 3 fields
# match polygon coordinate to coordinate of data
lng_lat <- mvc.xy[, c(3, 4)]
coordinates(lng_lat) <- c("LONGITUDE", "LATITUDE")
proj4string(lng_lat) <- proj4string(borough_wai)
# search events in the polygons
boro <- over(lng_lat, borough_wai)
commu <- over(lng_lat, community_wai)
# drop data containing NA and assign correct attributes to columns
mvc.xy$BOROUGH <- boro$boro_name
mvc.xy$COMMUNITY<- commu$boro_cd
mvc.xy <- mvc.xy[complete.cases(mvc.xy), ]
mvc.xy$DATE <- as.Date(mvc.xy$DATE, format = "%m/%d/%Y")
mvc.xy$COMMUNITY <- as.factor(mvc.xy$COMMUNITY)
- There are 275683 observations left after removing observations which do not have the information of latitude and longitude.
Plots
- Collisions incidents grouped by boroughs
library(ggplot2)
library(scales)
ggplot(mvc.xy, mapping = aes(BOROUGH)) + geom_bar()

- Collisions incidents by dates
g2 <- qplot(DATE, data = mvc.xy, geom = "histogram", binwidth = 10, facets = . ~ BOROUGH)
g2 + scale_x_date(breaks = date_breaks('4 months'), labels = date_format("%Y-%m")) +
theme(axis.text.x = element_text(angle=90))

- Collisions incidents on the map (use leaflet package) using default cluster option
library(leaflet)
library(rgdal)
library(sp)
#load borough shape file (https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm)
borough <- readOGR("Borough", layer = "geo_export_45eb8df1-3366-4833-a27e-dfb36076578d")
## OGR data source with driver: ESRI Shapefile
## Source: "Borough", layer: "geo_export_45eb8df1-3366-4833-a27e-dfb36076578d"
## with 5 features
## It has 4 fields
leaflet(mvc.xy) %>% addTiles() %>% addMarkers(clusterOptions = markerClusterOptions()) %>%
addPolygons(data = borough, fill = FALSE, stroke = TRUE, color = "darkred")
- Collisions incidents based on Borough
library(leaflet)
library(rgdal)
library(sp)
#load borough shape file (https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm)
borough <- readOGR("Borough", layer = "geo_export_45eb8df1-3366-4833-a27e-dfb36076578d")
## OGR data source with driver: ESRI Shapefile
## Source: "Borough", layer: "geo_export_45eb8df1-3366-4833-a27e-dfb36076578d"
## with 5 features
## It has 4 fields
#produce clusters according to boroguhs
attach(mvc.xy)
boro.count <- as.data.frame(table(mvc.xy$BOROUGH))
boro.count$longitude <- tapply(mvc.xy$LONGITUDE, BOROUGH, mean)
boro.count$latitude <- tapply(mvc.xy$LATITUDE, BOROUGH, mean)
#make figure based on boroguh
leaflet(boro.count) %>% addTiles() %>%
addCircles(radius = ~sqrt(Freq)*15, stroke = FALSE,
fillOpacity = 0.4, popup = ~as.character(Freq)) %>%
addPolygons(data = borough, fill = FALSE, stroke = TRUE, color = "black", weight = 1.5)
detach(mvc.xy)
- Collisions incidents based on community
library(leaflet)
library(rgdal)
library(sp)
#load community shape file (https://data.cityofnewyork.us/City-Government/Borough-Boundaries/tqmj-j8zm)
community <- readOGR("Community", layer = "geo_export_45a3484a-ea1e-4a8e-ab00-89356adabe6f")
## OGR data source with driver: ESRI Shapefile
## Source: "Community", layer: "geo_export_45a3484a-ea1e-4a8e-ab00-89356adabe6f"
## with 71 features
## It has 3 fields
#produce clusters according to community
attach(mvc.xy)
commu.count <- as.data.frame(table(mvc.xy$COMMUNITY))
commu.count$longitude <- tapply(mvc.xy$LONGITUDE, COMMUNITY, mean)
commu.count$latitude <- tapply(mvc.xy$LATITUDE, COMMUNITY, mean)
## make figure based on community
leaflet(commu.count) %>% addTiles() %>%
addCircles(radius = ~sqrt(Freq)*15, stroke = FALSE,
fillOpacity = 0.4, popup = ~as.character(Freq)) %>%
addPolygons(data = community, fill = FALSE, stroke = TRUE, color = "black", weight = 1)
detach(mvc.xy)